package noveiras;

import java.net.URL;
import java.util.ArrayList;

import au.id.jericho.lib.html.Source;

/**
 * Command-line utility that renders an HTML document to plain text with the
 * Jericho {@code Source} renderer and prints its non-blank lines, one per
 * output line. Lines containing tab characters are prefixed with the number
 * of tab-separated cells they hold.
 */
public class RenderToText {
	/**
	 * Renders the HTML document named by the first argument and prints:
	 * the total number of rendered lines, the number of non-blank lines,
	 * and then each non-blank line on its own output line.
	 *
	 * @param args optional; {@code args[0]} is a URL or a local file path.
	 *             When no argument is given a default sample file is used
	 *             and a notice is written to standard error.
	 * @throws Exception if the URL is malformed or the document cannot be read
	 */
	public static void main(String[] args) throws Exception {
		// Other sample inputs used during development:
		//   data/epij.txt.html, data/rankpep.html, data/syfpeithi.html
		String sourceUrlString="data/hlabind.html";
		if (args.length==0) {
			System.err.println("Using default argument of \""+sourceUrlString+'"');
		} else {
			sourceUrlString=args[0];
		}

		// A value without any ':' has no URL scheme, so treat it as a local file path.
		if (sourceUrlString.indexOf(':')==-1) {
			sourceUrlString="file:"+sourceUrlString;
		}

		Source source=new Source(new URL(sourceUrlString));
		String renderedText=source.getRenderer().toString();

		String[] renderedLines = splitLines(renderedText);
		System.out.println(renderedLines.length);

		ArrayList<String> nonBlankLines = removeBlankLines(renderedLines);
		System.out.println(nonBlankLines.size());

		// println (not print): splitting removed the line terminators, so they
		// must be put back or every line would run together in the output.
		for (String line : nonBlankLines) {
			System.out.println(describeLine(line));
		}
	}

	/**
	 * Splits text into lines, accepting both "\n" and "\r\n" terminators so
	 * that no stray carriage return is left at the end of a line.
	 */
	private static String[] splitLines(String text) {
		return text.split("\r?\n");
	}

	/** Returns the lines that contain at least one non-whitespace character, in order. */
	private static ArrayList<String> removeBlankLines(String[] lines) {
		ArrayList<String> kept = new ArrayList<String>();
		for (String line : lines) {
			if (line.trim().length() > 0) {
				kept.add(line);
			}
		}
		return kept;
	}

	/**
	 * Formats one line for output: lines containing a tab are prefixed with
	 * their tab-separated cell count and "--->"; other lines are returned
	 * unchanged. Trailing empty cells are not counted (String.split drops them).
	 */
	private static String describeLine(String line) {
		if (line.contains("\t")) {
			return line.split("\t").length+"--->"+line;
		}
		return line;
	}

}
